Setup
Load libraries
library(ggplot2)
library(tidyr)
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(patchwork)
# parallelization
library(future)
options(future.globals.maxSize= +Inf)
plan()
sequential:
- args: function (expr, envir = parent.frame(), substitute = TRUE, lazy = FALSE, seed = NULL, globals = TRUE, local = TRUE, earlySignal = FALSE, label = NULL, ...)
- tweaked: FALSE
- call: NULL
Process Human Data
import_remote_data <- function(file_url, type = "table", header = FALSE) {
con <- gzcon(url(file_url))
txt <- readLines(con)
if (type == "MM") { return (readMM(textConnection(txt))) }
if (type == "table") { return (read.table(textConnection(txt), header = header)) }
}
count_matrix_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_counts.mtx.gz"
gene_names_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_gene_names.txt.gz"
sample_annotations_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_sample_annotations.tsv.gz"
human.count_matrix <- as.matrix(import_remote_data(count_matrix_URL, type = "MM"))
human.gene_names <- import_remote_data(gene_names_URL, type = "table")
human.sample_annotations <- import_remote_data(sample_annotations_URL, type = "table", header = TRUE)
human_ret_seurat
An object of class Seurat
19712 features across 20091 samples within 1 assay
Active assay: RNA (19712 features, 0 variable features)
Process Mouse Data
mouse.data <- Read10X(data.dir = "filtered_feature_bc_matrix")
dimnames(mouse.data)[[1]] <- tolower(dimnames(mouse.data)[[1]])
dimnames(mouse.data)[[2]] <- tolower(dimnames(mouse.data)[[2]])
mouse_ret_seurat <- CreateSeuratObject(counts = mouse.data,
project = "mouse_ret",
min.cells = 3,
min.features = 200)
mouse_ret_seurat
An object of class Seurat
16424 features across 4510 samples within 1 assay
Active assay: RNA (16424 features, 0 variable features)
Process Primate Data
url=https://ftp.ncbi.nlm.nih.gov/geo/series/GSE118nnn/GSE118546/suppl/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
wget $url -O primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
gunzip primate_data/*
install.packages( c('devtools', 'roxygen2') )
library(devtools)
library(roxygen2)
install_github( 'hb-gitified/cellrangerRkit',
auth_token = 'your_token' )
macaque_fovea_seurat
An object of class Seurat
30039 features across 111993 samples within 1 assay
Active assay: RNA (30039 features, 0 variable features)
Cleanup
rm(human.count_matrix, human.gene_names, human.sample_annotations)
rm(count_matrix_URL, gene_names_URL, sample_annotations_URL, import_remote_data)
rm(mouse.data)
rm(Count.mat_fovea, macaque_fovea)
Combine
# combine
ret.list <- list(human = human_ret_seurat, mouse = mouse_ret_seurat, macaque = macaque_fovea_seurat)
# preprocess
ret.list <- lapply(X = ret.list, FUN = function(x) {
x <- NormalizeData(x, verbose = FALSE)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
})
# cleanup
rm(human_ret_seurat, mouse_ret_seurat, macaque_fovea_seurat)
Integration
plan("multiprocess", workers = 4)
ret.anchors <- FindIntegrationAnchors(object.list = ret.list, dims = 1:50, anchor.features = 1000)
plan("multiprocess", workers = 1)
ret.combined <- IntegrateData(anchorset = ret.anchors, dims = 1:50)
Integrated Analysis
plan("multiprocess", workers = 4)
DefaultAssay(ret.combined) <- "integrated"
# Run the standard workflow for visualization and clustering
ret.combined <- ScaleData(ret.combined, verbose = FALSE)
ret.combined <- RunPCA(ret.combined, npcs = 50, verbose = FALSE)
# t-SNE and Clustering
ret.combined <- RunUMAP(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindNeighbors(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindClusters(ret.combined, resolution = 0.075)
UMAP Visualization
DimPlot(ret.combined, reduction = "umap", group.by = "orig.ident")

DimPlot(ret.combined, reduction = "umap", label = TRUE)

DimPlot(ret.combined, reduction = "umap", split.by = "orig.ident", ncol = 1)

Identify Clusters with Canonical Markers
DefaultAssay(ret.combined) <- "RNA"
features <- tolower(c("Pde6a","Gnat2","Nefl","Camk2b","Thy1","Gad1","Slc6a9",
"Pcsk6","Trpm1","Sept4","Glul","Arr3","C1qa","Tm4sf1", "Mgp"))
FeaturePlot(object = ret.combined,
features = features,
pt.size = 0.1,
cols = c("lightgrey", "#F26969"),
min.cutoff = "q9",
combine = TRUE) & NoLegend() & NoAxes()

# Cowplot method: make sure to change to "combine = FALSE" and remove "& NoLegend() & NoAxes"
# for(i in 1:length(p)) {
# p[[i]] <- p[[i]] + NoLegend() + NoAxes()
# }
#
# cowplot::plot_grid(plotlist = p, ncol=3)
- Rod : pde6a
- AC (amacrine cell) : gad1, slc6a9
- MG (Müller glia) : glul
- BC (bipolar cell) : Trpm, camk2b
- CC (cone cell) : gnat2, arr3
- RGC (retinal ganglial cell) : nefl, thy1
- VC (vascular cell) : mgp, tm4sf1
- M (microglia) : c1qa
- HC (horizontal cell) : sept4
Markers were determined from this paper and other sources.

Find Differentially Expressed Genes
cells.types <- c("Rod", "BC", "MG", "RGC", "CC", "AC", "VC", "HC", "M")
theme_set(theme_cowplot())
cell_type_avg <- function(seurat.combined, ident) {
cells.x <- subset(seurat.combined, idents = ident)
Idents(cells.x) <- "orig.ident"
cells.x.avg <- log1p(AverageExpression(cells.x, verbose = FALSE)$RNA)
cells.x.avg$gene <- rownames(cells.x.avg)
return(cells.x.avg)
}
cells.plot <- as.list(cells.types)
cells.plot <- lapply(cells.plot, FUN = function(x) {
cells.x.avg <- cell_type_avg(ret.combined, ident = x)
x <- ggplot(cells.x.avg, aes(human_ret, mouse_ret)) + geom_point(size = 0.1) + ggtitle(x)
return(x)
})
# For individual plots
# for (p in cells.plot) {
# print(p)
# }
# For grid plot
cowplot::plot_grid(plotlist = cells.plot, ncol = 3)

ret.combined$celltype.organism <- paste(Idents(ret.combined), ret.combined$orig.ident, sep = "_")
ret.combined$celltype <- Idents(ret.combined)
Idents(ret.combined) <- "celltype.organism"
Tables with the most differentially expressed genes in each cell subtype:
for(i in seq_along(cells.diffgenes)) {
print(knitr::kable(head(cells.diffgenes[[i]]),caption=cells.types[[i]]))
}
Rod
| ckb |
0 |
1.4493770 |
0.918 |
0.724 |
0 |
Inf |
Rod |
ckb |
| hsp90aa1 |
0 |
1.3457646 |
0.854 |
0.627 |
0 |
Inf |
Rod |
hsp90aa1 |
| nrl |
0 |
1.3140138 |
0.874 |
0.635 |
0 |
Inf |
Rod |
nrl |
| 0610009b22rik |
0 |
-0.6622860 |
0.000 |
0.130 |
0 |
Inf |
Rod |
0610009b22rik |
| gm17018 |
0 |
-0.6831275 |
0.000 |
0.130 |
0 |
Inf |
Rod |
gm17018 |
| spata1 |
0 |
-0.6929677 |
0.000 |
0.132 |
0 |
Inf |
Rod |
spata1 |
BC
| neat1 |
0 |
3.086391 |
0.793 |
0.064 |
0 |
Inf |
BC |
neat1 |
| mtch1 |
0 |
-1.305054 |
0.000 |
0.459 |
0 |
Inf |
BC |
mtch1 |
| selenom |
0 |
-1.338108 |
0.000 |
0.480 |
0 |
Inf |
BC |
selenom |
| araf |
0 |
-1.342891 |
0.013 |
0.494 |
0 |
Inf |
BC |
araf |
| klc3 |
0 |
-1.424615 |
0.002 |
0.500 |
0 |
Inf |
BC |
klc3 |
| pea15a |
0 |
-1.427543 |
0.000 |
0.500 |
0 |
Inf |
BC |
pea15a |
MG
| tf |
0 |
5.089073 |
0.962 |
0.000 |
0 |
Inf |
MG |
tf |
| spp1 |
0 |
3.879036 |
0.847 |
0.003 |
0 |
Inf |
MG |
spp1 |
| crabp1 |
0 |
3.865908 |
0.876 |
0.028 |
0 |
Inf |
MG |
crabp1 |
| gpx3 |
0 |
3.736219 |
0.869 |
0.052 |
0 |
Inf |
MG |
gpx3 |
| ftl |
0 |
3.672007 |
0.877 |
0.000 |
0 |
Inf |
MG |
ftl |
| actg1 |
0 |
3.639157 |
0.905 |
0.026 |
0 |
Inf |
MG |
actg1 |
RGC
| malat1 |
0e+00 |
-5.358199 |
0.000 |
1.000 |
0.0000020 |
23.73532 |
RGC |
malat1 |
| gm42418 |
0e+00 |
-5.948091 |
0.000 |
0.957 |
0.0000083 |
22.29749 |
RGC |
gm42418 |
| ay036118 |
0e+00 |
-2.770196 |
0.000 |
0.826 |
0.0004565 |
18.29374 |
RGC |
ay036118 |
| neat1 |
0e+00 |
5.237230 |
0.889 |
0.087 |
0.0007506 |
17.79653 |
RGC |
neat1 |
| linc00599 |
1e-07 |
2.669694 |
0.815 |
0.000 |
0.0024960 |
16.59495 |
RGC |
linc00599 |
| kcnq1ot1 |
1e-07 |
2.290579 |
0.926 |
0.261 |
0.0025965 |
16.55548 |
RGC |
kcnq1ot1 |
CC
| gm42418 |
0 |
-5.444663 |
0.000 |
1.000 |
0 |
185.7925 |
CC |
gm42418 |
| malat1 |
0 |
-5.893437 |
0.000 |
1.000 |
0 |
185.7925 |
CC |
malat1 |
| ay036118 |
0 |
-2.515711 |
0.000 |
0.943 |
0 |
170.6009 |
CC |
ay036118 |
| gnat2 |
0 |
-2.832927 |
0.117 |
0.943 |
0 |
153.8304 |
CC |
gnat2 |
| mir124a-1hg |
0 |
-2.420164 |
0.000 |
0.869 |
0 |
151.9125 |
CC |
mir124a-1hg |
| gngt2 |
0 |
-3.274161 |
0.143 |
0.937 |
0 |
149.3963 |
CC |
gngt2 |
AC
| gm42418 |
0 |
-5.868449 |
0 |
1.000 |
0 |
258.1168 |
AC |
gm42418 |
| malat1 |
0 |
-6.139436 |
0 |
0.994 |
0 |
255.9481 |
AC |
malat1 |
| ay036118 |
0 |
-2.993976 |
0 |
0.935 |
0 |
236.7899 |
AC |
ay036118 |
| snhg11 |
0 |
-3.730037 |
0 |
0.916 |
0 |
230.5453 |
AC |
snhg11 |
| ac149090.1 |
0 |
-2.375670 |
0 |
0.729 |
0 |
173.6489 |
AC |
ac149090.1 |
| c230004f18rik |
0 |
-2.693746 |
0 |
0.729 |
0 |
173.6488 |
AC |
c230004f18rik |
VC
| hla-b |
0 |
3.429125 |
0.884 |
0.00 |
0 |
112.97262 |
VC |
hla-b |
| rps3a |
0 |
2.938618 |
0.826 |
0.00 |
0 |
104.06885 |
VC |
rps3a |
| hla-e |
0 |
3.220179 |
0.826 |
0.00 |
0 |
104.06878 |
VC |
hla-e |
| hla-a |
0 |
3.030967 |
0.812 |
0.00 |
0 |
101.88369 |
VC |
hla-a |
| hla-c |
0 |
2.913989 |
0.797 |
0.00 |
0 |
99.71476 |
VC |
hla-c |
| a2m |
0 |
3.322554 |
0.797 |
0.01 |
0 |
96.28914 |
VC |
a2m |
HC
| gm42418 |
0 |
-6.046691 |
0.000 |
1.000 |
0 |
108.45517 |
HC |
gm42418 |
| malat1 |
0 |
-5.904074 |
0.000 |
0.942 |
0 |
100.35094 |
HC |
malat1 |
| neat1 |
0 |
4.028649 |
0.967 |
0.043 |
0 |
70.98094 |
HC |
neat1 |
| ay036118 |
0 |
-2.922930 |
0.000 |
0.710 |
0 |
70.65422 |
HC |
ay036118 |
| pvalb |
0 |
3.437120 |
0.902 |
0.000 |
0 |
63.70192 |
HC |
pvalb |
| gpi1 |
0 |
-2.385417 |
0.000 |
0.580 |
0 |
55.70038 |
HC |
gpi1 |
M
| ftl |
0 |
4.612295 |
0.98 |
0 |
0 |
80.75030 |
M |
ftl |
| hla-dra |
0 |
4.664191 |
0.94 |
0 |
0 |
76.57279 |
M |
hla-dra |
| hla-a |
0 |
2.899218 |
0.94 |
0 |
0 |
76.57279 |
M |
hla-a |
| hla-drb1 |
0 |
4.096260 |
0.92 |
0 |
0 |
74.52028 |
M |
hla-drb1 |
| rps3a |
0 |
3.397725 |
0.92 |
0 |
0 |
74.52028 |
M |
rps3a |
| hla-b |
0 |
3.163581 |
0.92 |
0 |
0 |
74.52028 |
M |
hla-b |
Save as csv files
for(i in seq_along(cells.diffgenes)) {
write.csv(cells.diffgenes[[i]], sprintf("results/%d_%s.csv", i, cells.types[[i]]))
}
genes_to_plot <- 3
for (i in seq_along(cells.types)) {
print(FeaturePlot(object = ret.combined,
features = rownames(cells.diffgenes[[i]])[1:genes_to_plot],
split.by = "orig.ident",
max.cutoff = 3,
cols = c("grey", "red"),
pt.size = 0.07,
combine = TRUE,
label.size = 0.5
) + plot_annotation(title = cells.types[[i]]) & NoLegend() & NoAxes()
)
}









Check cell proportion for each species:
knitr::kable(prop.table(x = table(Idents(ret.combined), ret.combined@meta.data$orig.ident), margin = 2))
| 0 |
0.2627047 |
0.1535810 |
0.2875831 |
| 1 |
0.5498980 |
0.0758172 |
0.3164080 |
| 2 |
0.0001493 |
0.1855205 |
0.0002217 |
| 3 |
0.0009955 |
0.1458216 |
0.0035477 |
| 4 |
0.0531581 |
0.0808176 |
0.0838137 |
| 5 |
0.0114977 |
0.0783888 |
0.0388027 |
| 6 |
0.0406650 |
0.0552267 |
0.0569845 |
| 7 |
0.0187148 |
0.0577625 |
0.0343681 |
| 8 |
0.0484794 |
0.0443242 |
0.0917960 |
| 9 |
0.0001493 |
0.0342075 |
0.0000000 |
| 10 |
0.0000498 |
0.0232247 |
0.0004435 |
| 11 |
0.0076154 |
0.0156081 |
0.0152993 |
| 12 |
0.0000000 |
0.0117775 |
0.0000000 |
| 13 |
0.0034344 |
0.0089827 |
0.0436807 |
| 14 |
0.0000000 |
0.0109203 |
0.0000000 |
| 15 |
0.0000000 |
0.0101435 |
0.0008869 |
| 16 |
0.0024887 |
0.0039645 |
0.0261641 |
| 17 |
0.0000000 |
0.0039110 |
0.0000000 |
Gene Enrichment Analysis

plot_enrichment <- function(type = "Rod", other = "") {
file_path <- sprintf("enrich_data/%s_%s.txt", type, other)
x <- read.table(file_path, header=T, sep="\t", skip = 11)
colnames(x) <- gsub("upload_1..fold.Enrichment.", "Fold_Enrichment", colnames(x))
colnames(x) <- gsub("upload_1..FDR.", "FDR", colnames(x))
colnames(x) <- gsub("GO.biological.process.complete", "GO", colnames(x))
colnames(x) <- gsub("Homo.sapiens...REFLIST..20851.", "Count", colnames(x))
x$GO<- factor(x$GO, levels = x$GO[order(x$Fold_Enrichment, decreasing =F)])
x <- x[order(x$FDR),]
x <- x[1:10,]
g<- ggplot(data=x, aes(x=Fold_Enrichment, y=GO, colour=FDR)) +
geom_point(aes(size=Count)) +
theme_bw() +
theme(panel.background = element_rect(fill = NA) ,
axis.ticks.x=element_blank(),
axis.text.y = element_text(size = 12) ,
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
labs(x = " ", y =" ") +
scale_colour_gradient(low = "red", high = "blue") +
ggtitle(type)
return(g)
}
for (type in cells.types) {
print(plot_enrichment(type = type, other= "top200"))
rm(type)
}









---
title: "Integrating Primate Data into Analysis"
output: 
  html_notebook:
    toc: true
---
# Setup
Load libraries
```{r message=FALSE, warning=FALSE}
library(ggplot2)
library(tidyr)
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(patchwork)

# parallelization
library(future)
options(future.globals.maxSize= +Inf)
plan()
```
Process Human Data
```{r}
import_remote_data <- function(file_url, type = "table", header = FALSE) {
  con <- gzcon(url(file_url))
  txt <- readLines(con)
  if (type == "MM") { return (readMM(textConnection(txt))) }
  if (type == "table") { return (read.table(textConnection(txt), header = header)) }
}
count_matrix_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_counts.mtx.gz"
gene_names_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_gene_names.txt.gz"
sample_annotations_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_sample_annotations.tsv.gz"

human.count_matrix <- as.matrix(import_remote_data(count_matrix_URL, type = "MM"))
human.gene_names <- import_remote_data(gene_names_URL, type = "table")
human.sample_annotations <- import_remote_data(sample_annotations_URL, type = "table", header = TRUE)
```
```{r}
rownames(human.count_matrix) <- tolower(human.gene_names[,1])
colnames(human.count_matrix) <- tolower(human.sample_annotations[,1])

human_ret_seurat <- CreateSeuratObject(counts = human.count_matrix, 
                                       meta.data = human.sample_annotations, 
                                       project = "human_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
human_ret_seurat
```

Process Mouse Data
```{r}
mouse.data <- Read10X(data.dir = "filtered_feature_bc_matrix")
dimnames(mouse.data)[[1]] <- tolower(dimnames(mouse.data)[[1]])
dimnames(mouse.data)[[2]] <- tolower(dimnames(mouse.data)[[2]])
mouse_ret_seurat <- CreateSeuratObject(counts = mouse.data, 
                                       project = "mouse_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
mouse_ret_seurat
```

Process Primate Data
```{bash}
url=https://ftp.ncbi.nlm.nih.gov/geo/series/GSE118nnn/GSE118546/suppl/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
wget $url -O primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
gunzip primate_data/*
```
```{r}
install.packages( c('devtools', 'roxygen2') )
library(devtools)
library(roxygen2)
install_github( 'hb-gitified/cellrangerRkit',
                auth_token = 'your_token' )
```
```{r}
load("primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata")

dimnames(Count.mat_fovea)[[1]] <- tolower(dimnames(Count.mat_fovea)[[1]])
macaque_fovea_seurat <- CreateSeuratObject(Count.mat_fovea,
                                           project = "macaque_fovea", 
                                           min.cells = 3, 
                                           min.features = 200)

# give macaque dta uniform name in "orig.ident" metadata column
AddMetaData(macaque_fovea_seurat, 
            metadata = macaque_fovea_seurat[["orig.ident"]], 
            col.name = "orig.sample.name")
macaque_fovea_seurat[["orig.ident"]] <- "macaque_fovea"

macaque_fovea_seurat
```
Cleanup
```{r}
rm(human.count_matrix, human.gene_names, human.sample_annotations)
rm(count_matrix_URL, gene_names_URL, sample_annotations_URL, import_remote_data)
rm(mouse.data)
rm(Count.mat_fovea, macaque_fovea)
```


Combine
```{r}
# combine
ret.list <- list(human = human_ret_seurat, mouse = mouse_ret_seurat, macaque = macaque_fovea_seurat)

# preprocess
ret.list <- lapply(X = ret.list, FUN = function(x) {
    x <- NormalizeData(x, verbose = FALSE)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
})

# cleanup
rm(human_ret_seurat, mouse_ret_seurat, macaque_fovea_seurat)
```

# Integration
```{r}
plan("multiprocess", workers = 4)
ret.anchors <- FindIntegrationAnchors(object.list = ret.list, dims = 1:50,  anchor.features = 1000)
plan("multiprocess", workers = 1)
ret.combined <- IntegrateData(anchorset = ret.anchors, dims = 1:50)
```

# Integrated Analysis
```{r}
plan("multiprocess", workers = 4)

DefaultAssay(ret.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
ret.combined <- ScaleData(ret.combined, verbose = FALSE)
ret.combined <- RunPCA(ret.combined, npcs = 50, verbose = FALSE)
# t-SNE and Clustering
ret.combined <- RunUMAP(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindNeighbors(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindClusters(ret.combined, resolution = 0.075)
```
# UMAP Visualization
```{r warning=FALSE}
DimPlot(ret.combined, reduction = "umap", group.by = "orig.ident")
DimPlot(ret.combined, reduction = "umap", label = TRUE)
```
```{r, fig.height = 4, fig.width = 3}
DimPlot(ret.combined, reduction = "umap", split.by = "orig.ident", ncol = 1)
```

# Identify Clusters with Canonical Markers
```{r}
DefaultAssay(ret.combined) <- "RNA"

features <- tolower(c("Pde6a","Gnat2","Nefl","Camk2b","Thy1","Gad1","Slc6a9",
                      "Pcsk6","Trpm1","Sept4","Glul","Arr3","C1qa","Tm4sf1", "Mgp"))

FeaturePlot(object = ret.combined, 
            features = features, 
            pt.size = 0.1,
            cols = c("lightgrey", "#F26969"),
            min.cutoff = "q9",
            combine = TRUE) & NoLegend() & NoAxes()
```

* Rod : pde6a
* AC (amacrine cell) : gad1, slc6a9
* MG (Müller glia) : glul
* BC (bipolar cell) : Trpm, camk2b
* CC (cone cell) : gnat2, arr3
* RGC (retinal ganglial cell) : nefl, thy1
* VC (vascular cell) : mgp, tm4sf1
* M (microglia) : c1qa
* HC (horizontal cell) : sept4

Markers were determined from [this](https://www.nature.com/articles/s41467-019-12780-8) paper and other sources.
```{r}
ret.combined <- RenameIdents(ret.combined, `0` = "MG", `1` = "Rod", `2` = "RGC", 
    `3` = "RGC", `4` = "BC", `5` = "CC", `6` = "BC", `7` = "AC", `8` = "BC", `9` = "RGC", 
    `10` = "RGC", `11`= "HC", `12` = "MG", `13` = "VC", `14` = "RGC", `15` = "RGC", `16` = "M", `17` = "RGC")

DimPlot(ret.combined, label = TRUE)
```


# Find Differentially Expressed Genes
```{r}
cells.types <- c("Rod", "BC", "MG", "RGC", "CC", "AC", "VC", "HC", "M")
theme_set(theme_cowplot())

cell_type_avg <- function(seurat.combined, ident) {
  cells.x <- subset(seurat.combined, idents = ident)
  Idents(cells.x) <- "orig.ident"
  cells.x.avg <- log1p(AverageExpression(cells.x, verbose = FALSE)$RNA)
  cells.x.avg$gene <- rownames(cells.x.avg)
  return(cells.x.avg)
}

cells.plot <- as.list(cells.types)
cells.plot <- lapply(cells.plot, FUN = function(x) {
  cells.x.avg <- cell_type_avg(ret.combined, ident = x)
  x <- ggplot(cells.x.avg, aes(human_ret, mouse_ret)) + geom_point(size = 0.1) + ggtitle(x)
  return(x)
})

# For individual plots
# for (p in cells.plot) {
#   print(p)
# }

# For grid plot
cowplot::plot_grid(plotlist = cells.plot, ncol = 3)
```
```{r}
ret.combined$celltype.organism <- paste(Idents(ret.combined), ret.combined$orig.ident, sep = "_")
ret.combined$celltype <- Idents(ret.combined)
Idents(ret.combined) <- "celltype.organism"
```
```{r}
cells.diffgenes <- as.list(cells.types)
cells.diffgenes <- lapply(cells.diffgenes, FUN = function(x) {
  lab_human <- sprintf("%s_human_ret", x)
  lab_mouse <- sprintf("%s_mouse_ret", x)
  return(FindMarkers(ret.combined, ident.1 = lab_human, ident.2 = lab_mouse))
})


for(i in seq_along(cells.diffgenes)) {
	x <- cells.diffgenes[[i]]
	x <- cbind(x, logp = -log(x$p_val), types = cells.types[[i]], genes = rownames(x))
	x <- x[!grepl("mt-", x$genes),] # remove mitochondrial genes
	cells.diffgenes[[i]] <- x
	rm(x)
}
```
Tables with the most differentially expressed genes in each cell subtype:
```{r}
for(i in seq_along(cells.diffgenes)) {
  print(knitr::kable(head(cells.diffgenes[[i]]),caption=cells.types[[i]]))
}
```
Save as csv files
```{r}
for(i in seq_along(cells.diffgenes)) {
  write.csv(cells.diffgenes[[i]], sprintf("results/%d_%s.csv", i, cells.types[[i]]))
}
```

```{r warning=FALSE}
genes_to_plot <- 3
for (i in seq_along(cells.types)) {
  print(FeaturePlot(object = ret.combined, 
              features = rownames(cells.diffgenes[[i]])[1:genes_to_plot], 
              split.by = "orig.ident", 
              max.cutoff = 3, 
              cols = c("grey", "red"),
              pt.size = 0.07,
              combine = TRUE,
              label.size = 0.5
              ) + plot_annotation(title = cells.types[[i]]) & NoLegend() & NoAxes()
        )
}
```

Check cell proportion for each species:
```{r}
knitr::kable(prop.table(x = table(Idents(ret.combined), ret.combined@meta.data$orig.ident), margin = 2))
```

# Gene Enrichment Analysis
```{r}
library(ggplot2)
library(ggrepel)
library(scales)
library(data.table)
cells.diffgenes.combined <- rbindlist(cells.diffgenes)

# Preprocessing
# for(i in seq_along(cells.diffgenes.combined)) {
# 	if (cells.diffgenes.combined[i]$logp > 750) {
# 		cells.diffgenes.combined[i]$logp <- 749
# 	}
# 	if (cells.diffgenes.combined[i]$avg_logFC > 3) {
# 		cells.diffgenes.combined[i]$avg_logFC <- 2.99
# 	}
# }
# 
# cells.diffgenes.combined$logp <- gsub("Inf", 749, cells.diffgenes.combined$logp)

ggplot(data=cells.diffgenes.combined, 
		   aes(x=avg_logFC,y=logp, colour=types, label = genes)) + 
	geom_point(size=0.2) + 
	theme_bw() + 
	theme(panel.background = element_rect(fill = NA), 
		  axis.ticks.x = element_blank(),  
		  axis.text.y = element_text(size = 12), 
		  panel.grid.major = element_blank(), 
		  panel.grid.minor = element_blank()) + 
	labs(x = "log2(Fold changes)\n(3K/WT)", y ="-log10(p value)") +
	scale_x_continuous(limits=c(-3, 3)) +
	scale_y_continuous(limits=c(1, 750)) +
	geom_hline(yintercept= 1, colour="grey", linetype="dashed", size=0.7 ) +
	geom_vline(xintercept= 0 , colour="grey",   size=0.7)

```

```{r, fig.height = 4, fig.width = 6, dpi = 400, warning=FALSE}
plot_enrichment <- function(type = "Rod", other = "") {
	file_path <- sprintf("enrich_data/%s_%s.txt", type, other)
	x <- read.table(file_path, header=T, sep="\t", skip = 11)
	colnames(x) <- gsub("upload_1..fold.Enrichment.", "Fold_Enrichment", colnames(x))
	colnames(x) <- gsub("upload_1..FDR.", "FDR", colnames(x))
	colnames(x) <- gsub("GO.biological.process.complete", "GO", colnames(x))
	colnames(x) <- gsub("Homo.sapiens...REFLIST..20851.", "Count", colnames(x))
	x$GO<- factor(x$GO, levels = x$GO[order(x$Fold_Enrichment, decreasing =F)])
	x <- x[order(x$FDR),]
	x <- x[1:10,]
	g<- ggplot(data=x, aes(x=Fold_Enrichment, y=GO, colour=FDR)) + 
		geom_point(aes(size=Count)) + 
		theme_bw() +
		theme(panel.background = element_rect(fill = NA) , 
			  axis.ticks.x=element_blank(), 
			  axis.text.y = element_text(size = 12) , 
			  panel.grid.major = element_blank(), 
			  panel.grid.minor = element_blank()) + 
		labs(x = " ", y =" ") +
		scale_colour_gradient(low = "red", high = "blue") +
		ggtitle(type)
	return(g)
}

for (type in cells.types) {
	print(plot_enrichment(type = type, other= "top200"))
	rm(type)
}
```

